Single sample T test
·
Based on: Handbook of Parametric and Nonparametric Statistical Procedures, 5th edition, Chapter 2
using Random
Random.seed!(783376894512)
using Distributions
using Intervals
using Plots
n_sim = 1_000_000 # number of simulations to verify theoretical distributions
True model
n = 4
σₜ = 2.0
μₜ = 3.0
Mₜ = Product([Normal(μₜ, σₜ) for _ in 1:n]) # assumed known to be Normal and i.i.d.
generate_data(Mₜ) = rand(Mₜ)
Hypothesis testing
function test_statistic(y, μ₀)
n = length(y)
μ̂ = sum(y) / n
σ̂² = sum((y .- μ̂) .^ 2) / (n - 1)
return (μ̂ - μ₀) / √(σ̂² / n)
end
Null Hypothesis is true
μ₀ = 3.0
μₜ == μ₀
simulated_statistics = [test_statistic(generate_data(Mₜ), μ₀) for _ in 1:n_sim]
distribution_under_H₀ = t -> pdf(TDist(n - 1), t)
Code
histogram(simulated_statistics; normalize=true, label="Simulated")
tmin = mean(simulated_statistics) -3*std(simulated_statistics)
tmax = mean(simulated_statistics) +3*std(simulated_statistics)
t_plot = tmin:((tmax - tmin) / 1000):tmax
plot!(distribution_under_H₀, t_plot; lw=5, label="Theoretical")
plot!(;
title="Distribution of single sample T test statistic, \n when Null Hypothesis is correct",
xlimits=(tmin, tmax),
xlabel="t",
ylabel="pdf",
grid=false,
legendfontsize=12,
tickfontsize=12,
guidefontsize=16,
)
Null Hypothesis is true, but assumptions don’t hold
Observations are non-normal
M_non_normal = Product([MixtureModel([Normal(μₜ+sqrt(3.99), 0.1),
Normal(μₜ-sqrt(3.99), 0.1),], [0.5, 0.5]) for _ in 1:n])
all(mean(M_non_normal) .== μₜ)
all(sqrt.(var(M_non_normal)) .≈ σₜ)
simulated_statistics = [test_statistic(generate_data(M_non_normal), μ₀) for _ in 1:n_sim]
Code
histogram(simulated_statistics; normalize=true, label="Simulated")
tmin = mean(simulated_statistics) -3*std(simulated_statistics)
tmax = mean(simulated_statistics) +3*std(simulated_statistics)
t_plot = tmin:((tmax - tmin) / 1000):tmax
plot!(distribution_under_H₀, t_plot; lw=5, label="Theoretical")
plot!(;
title="Distribution of single sample T test statistic, \n under H₀, but non-normal observations",
xlimits=(tmin, tmax),
xlabel="t",
ylabel="pdf",
grid=false,
legendfontsize=12,
tickfontsize=12,
guidefontsize=16,
)
Observations are non-independent
M_non_independent = MvNormal([μₜ, μₜ, μₜ, μₜ, μₜ], [σₜ^2 0.5*σₜ^2 0 0 0;
0.5*σₜ^2 σₜ^2 0 0 0;
0 0 σₜ^2 0 0;
0 0 0 σₜ^2 0;
0 0 0 0 σₜ^2])
all(mean(M_non_independent) .== μₜ)
simulated_statistics = [test_statistic(generate_data(M_non_independent), μ₀) for _ in 1:n_sim]
Code
histogram(simulated_statistics; normalize=true, label="Simulated")
tmin = mean(simulated_statistics) -3*std(simulated_statistics)
tmax = mean(simulated_statistics) +3*std(simulated_statistics)
t_plot = tmin:((tmax - tmin) / 1000):tmax
plot!(distribution_under_H₀, t_plot; lw=5, label="Theoretical")
plot!(;
title="Distribution of single sample T test statistic, \n under H₀, but non-independent observations",
xlimits=(tmin, tmax),
xlabel="t",
ylabel="pdf",
grid=false,
legendfontsize=12,
tickfontsize=12,
guidefontsize=16,
)
Observations are non-identical
M_non_identical = Product([Normal(μₜ, i*σₜ) for i in 1:n])
all(mean(M_non_identical) .== μₜ)
simulated_statistics = [test_statistic(generate_data(M_non_identical), μ₀) for _ in 1:n_sim]
Code
histogram(simulated_statistics; normalize=true, label="Simulated")
tmin = mean(simulated_statistics) -3*std(simulated_statistics)
tmax = mean(simulated_statistics) +3*std(simulated_statistics)
t_plot = tmin:((tmax - tmin) / 1000):tmax
plot!(distribution_under_H₀, t_plot; lw=5, label="Theoretical")
plot!(;
title="Distribution of single sample T test statistic, \n under H₀, but non-identical observations",
xlimits=(tmin, tmax),
xlabel="t",
ylabel="pdf",
grid=false,
legendfontsize=12,
tickfontsize=12,
guidefontsize=16,
)
Null Hypothesis is false
μ₀ = 5.0
μₜ != μ₀
simulated_statistics = [test_statistic(generate_data(Mₜ), μ₀) for _ in 1:n_sim]
distribution_under_H₁ = tnc -> pdf(NoncentralT(n - 1, (μₜ - μ₀) / √(σₜ^2 / n)), tnc)
Code
histogram(simulated_statistics; normalize=true, label="Simulated")
tmin = mean(simulated_statistics) -3*std(simulated_statistics)
tmax = mean(simulated_statistics) +3*std(simulated_statistics)
t_plot = tmin:((tmax - tmin) / 1000):tmax
plot!(distribution_under_H₁, t_plot; lw=5, label="Theoretical")
plot!(;
title="Distribution of single sample T test statistic, \n when Null Hypothesis is incorrect",
xlimits=(tmin, tmax),
xlabel="t",
ylabel="pdf",
grid=false,
legendfontsize=12,
tickfontsize=12,
guidefontsize=16,
)
Null Hypothesis is false, but assumptions don’t hold
Same counter examples as above.
Code
simulated_statistics = [test_statistic(generate_data(M_non_normal), μ₀) for _ in 1:n_sim]
histogram(simulated_statistics; normalize=true, label="Simulated")
tmin = mean(simulated_statistics) -3*std(simulated_statistics)
tmax = mean(simulated_statistics) +3*std(simulated_statistics)
t_plot = tmin:((tmax - tmin) / 1000):tmax
plot!(distribution_under_H₁, t_plot; lw=5, label="Theoretical")
Code
plot!(;
title="Distribution of single sample T test statistic, \n under H₁, but non-normal observations",
xlimits=(tmin, tmax),
xlabel="t",
ylabel="pdf",
grid=false,
legendfontsize=12,
tickfontsize=12,
guidefontsize=16,
)
Code
simulated_statistics = [test_statistic(generate_data(M_non_independent), μ₀) for _ in 1:n_sim]
histogram(simulated_statistics; normalize=true, label="Simulated")
tmin = mean(simulated_statistics) -3*std(simulated_statistics)
tmax = mean(simulated_statistics) +3*std(simulated_statistics)
t_plot = tmin:((tmax - tmin) / 1000):tmax
plot!(distribution_under_H₁, t_plot; lw=5, label="Theoretical")
plot!(;
title="Distribution of single sample T test statistic, \n under H₁, but non-independent observations",
xlimits=(tmin, tmax),
xlabel="t",
ylabel="pdf",
grid=false,
legendfontsize=12,
tickfontsize=12,
guidefontsize=16,
)
Code
simulated_statistics = [test_statistic(generate_data(M_non_identical), μ₀) for _ in 1:n_sim]
histogram(simulated_statistics; normalize=true, label="Simulated")
tmin = mean(simulated_statistics) -3*std(simulated_statistics)
tmax = mean(simulated_statistics) +3*std(simulated_statistics)
t_plot = tmin:((tmax - tmin) / 1000):tmax
plot!(distribution_under_H₁, t_plot; lw=5, label="Theoretical")
plot!(;
title="Distribution of single sample T test statistic, \n under H₁, but non-identical observations",
xlimits=(tmin, tmax),
xlabel="t",
ylabel="pdf",
grid=false,
legendfontsize=12,
tickfontsize=12,
guidefontsize=16,
)
Confidence Interval
α = 0.05 # proportion of CIs which should fail to cover true values
function confidence_interval(y, α)
n = length(y)
μ̂ = sum(y) / n
σ̂² = sum((y .- μ̂) .^ 2) / (n - 1)
L = μ̂ + quantile(TDist(n - 1), α / 2) * (√(σ̂² / n))
U = μ̂ + quantile(TDist(n - 1), 1 - α / 2) * (√(σ̂² / n))
return Interval(L, U)
end
simulated_CIs = [confidence_interval(generate_data(Mₜ), α) for _ in 1:n_sim]
coverage = count([μₜ in CI for CI in simulated_CIs]) / n_sim
Code
bar(["inside CI", "outside CI"], [coverage, 1 - coverage])
plot!(;
title="Simulated Coverage of 95% confidence interval \n corresponding to single sample T test",
ylimits=(0.0, 1.0),
yticks=[0.05, 0.95],
ylabel="proportion",
grid=true,
legend=false,
legendfontsize=12,
tickfontsize=12,
guidefontsize=16,
)
assumptions don’t hold
Same counter examples as above.
Code
simulated_CIs = [confidence_interval(generate_data(M_non_normal), α) for _ in 1:n_sim]
coverage = count([μₜ in CI for CI in simulated_CIs]) / n_sim
bar(["inside CI", "outside CI"], [coverage, 1 - coverage])
plot!(;
title="Simulated Coverage of 95% confidence interval \n corresponding to single sample T test \n but non-normal observations",
ylimits=(0.0, 1.0),
yticks=[0.05, 0.95],
ylabel="proportion",
grid=true,
legend=false,
legendfontsize=12,
tickfontsize=12,
guidefontsize=16,
#top_margin = 25Plots.px,
)
Code
simulated_CIs = [confidence_interval(generate_data(M_non_independent), α) for _ in 1:n_sim]
coverage = count([μₜ in CI for CI in simulated_CIs]) / n_sim
bar(["inside CI", "outside CI"], [coverage, 1 - coverage])
plot!(;
title="Simulated Coverage of 95% confidence interval \n corresponding to single sample T test \n but non-independent observations",
ylimits=(0.0, 1.0),
yticks=[0.05, 0.95],
ylabel="proportion",
grid=true,
legend=false,
legendfontsize=12,
tickfontsize=12,
guidefontsize=16,
)
Code
simulated_CIs = [confidence_interval(generate_data(M_non_identical), α) for _ in 1:n_sim]
coverage = count([μₜ in CI for CI in simulated_CIs]) / n_sim
bar(["inside CI", "outside CI"], [coverage, 1 - coverage])
plot!(;
title="Simulated Coverage of 95% confidence interval \n corresponding to single sample T test \n but non-identical observations",
ylimits=(0.0, 1.0),
yticks=[0.05, 0.95],
ylabel="proportion",
grid=true,
legend=false,
legendfontsize=12,
tickfontsize=12,
guidefontsize=16,
)